# Multi-process snowball-crawl script: crawl for pages mentioning 'scrapbook'.
import snowcrawl, time, re

def containsScrapbook( text, params ):
	"""Classify a page: keep it if any token starts with 'scrapbook'.

	Parameters:
		text   -- the downloaded page body as a string.
		params -- crawl Parameters object; unused here, kept so the
		          function matches the classifier-callback interface.

	Returns (kept, stats) where kept is 1/0 and stats is
	[start_time, end_time, len(text), token_count] for the CSV log.
	"""
	start_time = time.time()
	# Raw string for the regex: '\w' in a plain string is an invalid
	# escape sequence (a DeprecationWarning/SyntaxWarning on Python 3).
	tokens = re.findall( r'\w+', text )

	# str.startswith gives the same prefix semantics as the old
	# re.match('scrapbook', ...) (so 'Scrapbooking' still matches),
	# and any() stops at the first hit instead of scanning every
	# remaining token.
	found_scrapbook = any( t.lower().startswith('scrapbook') for t in tokens )

	end_time = time.time()
	return ( int(found_scrapbook), [start_time, end_time, len(text), len(tokens)] )

def myUrlProcesser( url_and_params ):
	"""Download, classify, and extract out-links for one URL.

	Accepts a single (url, params) tuple.  Tuple unpacking in the
	signature (Python 2 only) was removed in Python 3 (PEP 3113), so
	the unpack now happens in the body; callers still pass one tuple.

	Returns (kept, text, kept_edges, all_edges, stats_row):
	kept_edges equals all_edges when the page was classified relevant,
	and [] otherwise, so only relevant pages' out-links are followed.
	"""
	(url, params) = url_and_params
	(text, download_stats) = snowcrawl.downloadUrl( 'http://'+url )
	(kept, classify_stats) = containsScrapbook( text, params )
	(edges, edge_stats) = snowcrawl.findEdges( url, text )
	# One CSV row: download + classify + edge timings/counts concatenated.
	stats = download_stats + classify_stats + edge_stats
	if kept:
		# Relevant page: crawl every out-link it contains.
		return (kept, text, edges, edges, stats)
	else:
		# Rejected page: record its edges but crawl none of them.
		return (kept, text, [], edges, stats)

def max10K(self):
	"""Termination callback: stop once more than 10,000 URLs have completed."""
	limit = 10000
	return self.urls_completed > limit

if __name__ == '__main__':
	# Directory where crawl output is written.
	output_path = 'results2/'

	# Crawl configuration: 500-URL waves, 5s timing granularity, keep only
	# the per-URL CSV stats (no state/file/edge dumps), prioritized frontier.
	crawl_params = snowcrawl.Parameters(
		time_inc = 5,
		time_limit = 5,
		wave_size = 500,
		save_states=False, save_files=False, save_edges=False,
		prioritize_urls = True,
		# Column names for the stats row assembled in myUrlProcesser.
		csv_header = ["download_start_time", "download_end_time", "length",
			"classify_start_time", "classify_end_time", "characters", "tokens",
			"edge_start_time", "edge_end_time", "out_degree", "self_loops"
		],
		process_url_function = myUrlProcesser,
		decide_terminate_function = max10K
	)

	# Seed the snowball crawl from the dmoz scrapbooking directory page.
	seeds = ['www.dmoz.org/Arts/Crafts/Scrapbooking/']

	crawler = snowcrawl.MultiCrawler( pool_size=20 )
	crawler.runUntilDone( output_path, crawl_params, seed_list=seeds, overwrite_existing_files=True )
